{
 "metadata": {
  "name": "",
  "signature": "sha256:a5e4d86c57516c2ed8987e0cf88fea66f7d3eb5e1129fe4fdc72fc736ef581fa"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Demo of NLTK language modeling\n",
      "-----------------------------\n",
      "\n",
      "- Jacob Eisenstein, September 2014\n",
      "- for Georgia Tech CS 4650 and 7650"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from collections import defaultdict\n",
      "from nltk.probability import FreqDist, ConditionalFreqDist, LidstoneProbDist, MLEProbDist\n",
      "from nltk.model import NgramModel\n",
      "from nltk.tokenize import word_tokenize, wordpunct_tokenize"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 354
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "with open('two-cities.words') as fin:\n",
      "    wordlist = [line.rstrip().lower() for line in fin.readlines()]\n",
      "wordlist = [word for word in wordlist if word.isalpha()]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 336
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "m_mle_1 = NgramModel(1,wordlist,estimator = lambda fdist,bins : MLEProbDist(fdist))\n",
      "m_mle_2 = NgramModel(2,wordlist,estimator = lambda fdist,bins : MLEProbDist(fdist))\n",
      "m_mle_3 = NgramModel(3,wordlist,estimator = lambda fdist,bins : MLEProbDist(fdist))\n",
      "m_mle_4 = NgramModel(4,wordlist,estimator = lambda fdist,bins : MLEProbDist(fdist))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 100
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print ' '.join(m_mle_2.generate(1000)[-20:])\n",
      "print ' '.join(m_mle_3.generate(1000)[-20:])\n",
      "print ' '.join(m_mle_4.generate(1000)[-20:])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "and your message returned madame defarge coming on the rush and mind about new and influence was neither want to\n",
        "the village all the utmost good that i had given information he had plenty of wine that started away in"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "reckoning in her mind the number of public domain and licensed works that can be freely distributed in machine readable"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 349
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sent = 'it was the best of times it was the worst of times'.split()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 350
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def testSent(sent,model,order):\n",
      "    words = sent.split()\n",
      "    tot_ll = 0\n",
      "    for i,word in enumerate(words):\n",
      "        try:\n",
      "            context = words[i-order+1:i]\n",
      "            randword = model.choose_random_word(context)\n",
      "            print '%s\\t%.4f\\t\\t%s\\t%.4f'%(word.upper(),model.prob(word,context),randword,model.prob(randword,context))\n",
      "            tot_ll += model.logprob(word,context)\n",
      "        except:\n",
      "            print word.upper()\n",
      "    return tot_ll"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 351
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "testSent('it was the best of times it was the worst of times',m_mle_1,1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "IT\t0.0147\t\tthe\t0.0582\n",
        "WAS\t0.0125\t\tappearance\t0.0002\n",
        "THE\t0.0582\t\tshould\t0.0010\n",
        "BEST\t0.0003\t\tin\t0.0188\n",
        "OF\t0.0293\t\tyou\t0.0106\n",
        "TIMES\t0.0004\t\tthe\t0.0582\n",
        "IT\t0.0147\t\tgutenberg\t0.0007\n",
        "WAS\t0.0125\t\tyou\t0.0106\n",
        "THE\t0.0582\t\tthat\t0.0138\n",
        "WORST\t0.0001\t\tlost\t0.0003\n",
        "OF\t0.0293\t\twhile\t0.0007\n",
        "TIMES\t0.0004\t\tmore\t0.0019\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 319,
       "text": [
        "90.80949839134138"
       ]
      }
     ],
     "prompt_number": 319
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "testSent('it was the best of times it was the worst of times',m_mle_2,2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "IT\n",
        "WAS\t0.1547\t\tis\t0.1129\n",
        "THE\t0.0637\t\twrong\t0.0006\n",
        "BEST\t0.0029\t\tstreets\t0.0053\n",
        "OF\t0.2500\t\tcondition\t0.0250\n",
        "TIMES\t0.0012\t\tthe\t0.2307\n",
        "IT\t0.0588\t\tas\t0.0980\n",
        "WAS\t0.1547\t\ts\t0.0168\n",
        "THE\t0.0637\t\ta\t0.0930\n",
        "WORST\t0.0019\t\tusage\t0.0001\n",
        "OF\t0.4444\t\tof\t0.4444\n",
        "TIMES\t0.0012\t\tmiss\t0.0012\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 318,
       "text": [
        "57.403650301729485"
       ]
      }
     ],
     "prompt_number": 318
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "testSent('it was the best of times it was the worst of times',m_mle_3,3)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "IT\n",
        "WAS\n",
        "THE\t0.1211\t\tmaking\t0.0031\n",
        "BEST\t0.0265\t\tconsideration\t0.0088\n",
        "OF\t0.4167\t\tof\t0.4167\n",
        "TIMES\t0.2000\t\tit\t0.3000\n",
        "IT\t0.4000\t\ta\t0.2000\n",
        "WAS\t0.6667\t\twas\t0.6667\n",
        "THE\t0.1211\t\ta\t0.1335\n",
        "WORST\t0.0088\t\ttrue\t0.0088\n",
        "OF\t0.5000\t\tof\t0.5000\n",
        "TIMES\t0.1250\t\tthe\t0.3750\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 317,
       "text": [
        "27.63827783894363"
       ]
      }
     ],
     "prompt_number": 317
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "testSent('it was the best of times it was the worst of times',m_mle_4,4)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "IT\n",
        "WAS\n",
        "THE\n",
        "BEST\t0.0513\t\tage\t0.0513\n",
        "OF\t0.3333\t\tcure\t0.3333\n",
        "TIMES\t0.2000\t\tplayers\t0.1000\n",
        "IT\t0.5000\t\tit\t0.5000\n",
        "WAS\t1.0000\t\twas\t1.0000\n",
        "THE\t1.0000\t\tthe\t1.0000\n",
        "WORST\t0.0256\t\tage\t0.0513\n",
        "OF\t1.0000\t\tof\t1.0000\n",
        "TIMES\t0.1250\t\thealth\t0.1250\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 352,
       "text": [
        "17.477695033333013"
       ]
      }
     ],
     "prompt_number": 352
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "news_sent = wordpunct_tokenize('The police chief of Ferguson, Missouri, told NBC News Thursday that we\\'ve done an excellent job, but also admitted there was room for improvement ahead of a federal investigation into the city\\'s embattled police department.')\n",
      "news_words = [word.lower() for word in news_sent if word.isalpha()]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 293
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# something is wrong with these results for the higher-order lidstone models at small values of alpha. \n",
      "# the unigram and bigram models match intuition, though\n",
      "lm_lids = dict()\n",
      "for order in range(1,4):\n",
      "    for alpha in [1e-4,1e-2,1e-1,1,10,100]:\n",
      "        lm_lids[(order,alpha)] = NgramModel(order,wordlist,estimator=lambda fdist,bins : LidstoneProbDist(fdist,alpha))\n",
      "        print order, alpha, lm_lids[(order,alpha)].perplexity(news_words)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 0.0001 19737.4070845\n",
        "1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.01 8257.74539295\n",
        "1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.1 5336.53056025\n",
        "1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 1 3444.91393594\n",
        "1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 10 2578.1139025\n",
        "1"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 100 3753.74459044\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.0001 493.21191832\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.01 436.66172022\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.1 420.107469669\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 1 473.027845286\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 10 823.681599114\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 100 1643.01212594\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.0001 2.55091674337\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.01 23.9844097975\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.1 73.8904995231\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 1 232.248044167\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 10 681.139585281\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 100 1490.04973285\n"
       ]
      }
     ],
     "prompt_number": 353
    }
   ],
   "metadata": {}
  }
 ]
}